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In [1]: 


import pandas as pd 


In [2]: 


df = pd.read csv('diabetes.csv') 
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In [3]; 
df.head() 
Out[3]: 

Glucose BloodPressure SkinThickness Insulin BMI 
0 148 72 35 0 33.6 
1 85 66 29 0 26.6 
2 183 64 0 0 23.3 
3 89 66 23 94 28.1 
4 137 40 35 168 43.1 
In [4]: 
df.tail() 
Out[4]: 

Glucose BloodPressure SkinThickness Insulin BMI 

768 0 0 0 0.0 
769 0 0 0 0.0 
770 0 0 0 0.0 
771 0 0 0 0.0 
772 0 0 0 0.0 
In [5]; 
df.shape 
Out[5]: 
(773, 8) 
In [6]: 


df.columns 


out[6]: 


Index(['Glucose', 
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'BloodPressure', 


'DiabetesPedigreeFunction', 


dtype-'object') 


DiabetesPedigreeFunction 


0.627 
0.351 
0.672 
0.167 


2.288 


50 
31 
32 
21 
33 


Age Outcome 


1 
0 


DiabetesPedigreeFunction Age Outcome 


'SkinThickness', 'Insulin', 
'Outcome'], 


'Age', 


0.0 
0.0 
0.0 
0.0 


0.0 


'BMI', 


0 
0 
0 
0 
0 


C 
C 
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In [7]: 


df.duplicated().sum() 


out[7]: 


4 


In [8]: 


df.isnull().sum() 


Out[8]: 


Glucose 

BloodPressure 
SkinThickness 

Insulin 

BMI 
DiabetesPedigreeFunction 
Age 

Outcome 


G GGG G G G G 


dtype: int64 


In [9]: 


df .info() 
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«class 'pandas.core.frame.DataFrame'» 
RangeIndex: 773 entries, 0 to 772 


Data columns (total 8 columns): 


# 


O n E WM KA G 1 


M 


Column 

Glucose 

BloodPressure 
SkinThickness 

Insulin 

BMI 
DiabetesPedigreeFunction 
Age 

Outcome 


dtypes: float64(2), int64(6) 
memory usage: 48.4 KB 
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Non-Null Count 


non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 
non-null 


float64 
float64 
int64 
int64 
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In [10]: 


df .describe() 


Out[10]: 


count 
mean 
std 
min 
25% 
50% 
75% 


max 


Glucose BloodPressure SkinThickness 


773.000000 
120.112549 
33.311787 
0.000000 
99.000000 
117.000000 
140.000000 
199.000000 


In [11]: 


df.nunique() 


Out [11]: 


Glucose 
BloodPressure 
SkinThickness 
Insulin 


BMI 


773.000000 
68.658473 
20.073629 

0.000000 
62.000000 
72.000000 
80.000000 


122.000000 


DiabetesPedigreeFunction 


Age 


Outcome 


dtype: 


int64 


In [12]: 


import matplotlib.pyplot as plt 
import seaborn as sns 


In [13]: 


import warnings 


773.000000 


20.403622 


15.985586 


0.000000 


0.000000 


23.000000 


32.000000 


99.000000 


warnings.filterwarnings( 'ignore') 


In [14]: 


df['Outcome'].unique() 


Out[14]: 


array([1, 0], dtype-int64) 
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Insulin 
773.000000 
79.283312 
115.048418 
0.000000 
0.000000 
23.000000 
126.000000 
846.000000 


BMI 
773.000000 
31.785640 
8.267017 
0.000000 
27.200000 
32.000000 
36.500000 
67.100000 


DiabetesPedigreeFunction 
773.000000 

0.468824 

0.332416 

0.000000 

0.240000 

0.370000 

0.624000 

2.420000 


> 
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In [15]: 


df['Outcome'].value counts() 


Out[15]: 
0 505 
1 268 


Name: Outcome, dtype: int64 


In [16]: 


plt.figure(figsize-(15,6)) 
sns.countplot(df['Outcome'], data - df, palette - 'hls') 
plt.show() 


count 


Outcome 


localhost:8888/notebooks/diabetes prediction.ipynb 4/19 


4/4/23, 11:18 PM diabetes prediction - Jupyter Notebook 


In [17]: 


plt.figure(figsize=(30,20)) 

plt.pie(df['Outcome'].value counts(), labels=df[ 'Outcome'].value counts().index, autopct="%1.1 
'color': 'black', 
'weight': 'bold', 
'family': 'serif' }) 

hfont = ('fontname':'serif', 'weight': 'bold'j 

plt.title('Outcome', size-20, **hfont) 

plt.show() 


Outcome 
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In [18]: 


for i in df.columns: 
plt.figure(figsize-(15,6)) 
sns.histplot(df[i], kde = True, bins - 20, palette - 'hls') 
plt.xticks(rotation - 90) 
plt.show() 


120 


w 
In [19]: 
for i in df.columns: 
plt.figure(figsize-(15,6)) 
sns.distplot(df[i], kde - True, bins - 20) 
plt.xticks(rotation - 90) 
plt.show() 
^ 


0.030 


0.025 w 
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In [22]: 


for i in df.columns: 
plt.figure(figsize-(15,6)) 
sns.boxplot(df[i], data - df, palette - 'hls') 
plt.xticks(rotation - 90) 


plt.show() 
4 
' 
EG å G 
Glucose 
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In [24]; 


for i in df.columns: 
for j in df.columns: 
plt.figure(figsize-(15,6)) 
sns.lineplot(x - df[i], y - df[j], data = df, palette - 'hls') 
plt.xticks(rotation - 90) 


plt.show() 
` = ` 2 =] 2 a R 
Glucose ^ 
100 
80 
Ñ 
Z 60 
g 
a 
o 
El 
= 40 
20 
0 
j 9 8 e E g 3 £ 8 
Glucose 


localhost:8888/notebooks/diabetes prediction.ipynb 7119 


4/4/23, 11:18 PM diabetes prediction - Jupyter Notebook 


In [25]: 


for i in df.columns: 
for j in df.columns: 
plt.figure(figsize=(15,6)) 
sns.scatterplot(x = df[i], y = df[j], data = df, palette = 'hls') 
plt.xticks(rotation = 90) 


plt.show() 
^ 
5 
à E 
In [26]: 
df new - df.drop(['Outcome'], axis - 1) 
In [28]: 
for i in df new.columns: 
plt.figure(figsize-(15,6)) 
sns.barplot(x = df['Outcome'], y = df new[i], data - df, ci - None, palette = 'hls') 
plt.xticks(rotation - 90) 
plt.show() 
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In [30]: 


for i in df_new.columns: 
plt.figure(figsize-(15,6)) 


sns.boxplot(x - df['Outcome'], y = df new[i], data - df, palette - 


plt.xticks(rotation - 90) 
plt.show() 


'his') 


Outcome 


120 


100 TL 


In [31]; 


import numpy as np 


In [32]: 


df corr = df.corr() 


In [33]: 
df corr 
out [33]: 
Glucose BloodPressure SkinThickness 
Glucose 1.000000 0.220699 0.084554 
BloodPressure 0.220699 1.000000 0.226704 
SkinThickness 0.084554 0.226704 1.000000 
Insulin 0.332712 0.100708 0.439518 
BMI 0.291421 0.343193 0.403183 
DiabetesPedigreeFunction 0.163684 0.070848 0.193493 
Age 0.310394 0.285733 -0.087681 
Outcome 0.462712 0.078662 0.080283 


4 
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Insulin 
0.332712 
0.100708 
0.439518 
1.000000 
0.205065 
0.189918 

-0.028708 
0.133391 


BMI 
0.291421 
0.343193 
0.403183 
0.205065 
1.000000 
0.168178 
0.102450 
0.296000 


DiabetesPed 
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In [34]: 


plt.figure(figsize=(10, 8)) 

matrix = np.triu(df_corr) 

sns.heatmap(df_corr, annot=True, linewidth=.8, mask=matrix, cmap="rocket"); 
plt.show() 
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In [35]: 
X = df.drop(['Outcome'], axis = 1) 
y = df['Outcome'] 
In [36]: 


from sklearn.preprocessing import StandardScaler 
scaler - StandardScaler() 
X = scaler.fit transform(X) 


In [37]: 


from sklearn.model selection import train test split 
X train, X test, y train, y test = train test split(X, y, test size = 0.25) 
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In [38]: 


from sklearn.linear_model import LogisticRegression 


In [39]: 


model - LogisticRegression() 
model.fit(X train, y train) 


Out [39]: 


iv LogisticRegression į 


LogisticRe ression()| 


In [40]: 


y pred - model.predict(X test) 


In [41]: 


from sklearn.metrics import accuracy score, precision score, recall score, f1 score, confusion 


In [42]: 


# calculate accuracy, precision, recall, and f1-score 
accuracy - accuracy score(y test, y pred) 

precision - precision score(y test, y pred) 

recall - recall score(y test, y pred) 

f1 = f1 score(y test, y pred) 


print("Accuracy: (:.2f)".format(accuracy)) 
print("Precision: (:.2f)".format(precision)) 
print("Recall: (:.2f)".format(recall)) 
print("F1-score: (:.2f)".format(f1)) 


Accuracy: 0.80 
Precision: 0.78 
Recall: 0.56 
F1-score: 0.65 


In [43]: 


# generate a confusion matrix 

cm - confusion matrix(y test, y pred) 
print("Confusion matrix:") 

print(cm) 


Confusion matrix: 
[[120 10] 
[28 236]] 
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In [45]: 


# create a heatmap visualization of the confusion matrix 


sns.heatmap(cm, annot-True, cmap="Blues") 
plt.title("Confusion Matrix") 
plt.xlabel("Predicted Class") 
plt.ylabel("True Class") 

plt.show() 


Confusion Matrix 
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In (461: 


# generate a classification report 

report = classification_report(y_test, y_pred) 
print("Classification report:") 

print(report) 


Classification report: 


precision recall f1-score support 

0 0.81 0.92 0.86 130 

1 0.78 0.56 0.65 64 

accuracy 0.80 194 
macro avg 0.80 0.74 0.76 194 
weighted avg 0.80 0.80 0.79 194 


In [47]: 


from sklearn.tree import DecisionTreeClassifier 


In [48]: 
clf = DecisionTreeClassifier() 


clf.fit(X train, y train) 


Out[48]: 


iv DecisionTreeClassifier | 


iDecisionTree lassifier() 
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In [49]: 


y pred = clf.predict(X test) 


In [50]: 


# calculate accuracy, precision, recall, and f1-score 
accuracy - accuracy score(y test, y pred) 

precision - precision score(y test, y pred) 

recall - recall score(y test, y pred) 

f1 = f1 score(y test, y pred) 


print("Accuracy: (:.2f)".format(accuracy)) 
print("Precision: (:.2f)".format(precision)) 
print("Recall: (:.2f)".format(recall)) 
print("F1-score: (:.2f)".format(f1)) 


Accuracy: 0.68 
Precision: 0.52 
Recall: 0.45 

F1-score: 0.48 


En [51]: 


# generate a confusion matrix 

cm - confusion matrix(y test, y pred) 
print("Confusion matrix:") 

print(cm) 


Confusion matrix: 


[[103 27] 
[35 29]] 
In [52]: 


# create a heatmap visualization of the confusion matrix 
sns.heatmap(cm, annot-True, cmap="Blues") 
plt.title("Confusion Matrix") 

plt.xlabel("Predicted Class") 

plt.ylabel("True Class") 

plt.show() 
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In [53]: 


# generate a classification report 

report = classification_report(y_test, y_pred) 
print("Classification report:") 

print(report) 


Classification report: 


precision recall fi-score support 

0 0.75 0.79 0.77 130 

1 0.52 0.45 0.48 64 

accuracy 0.68 194 
macro avg 0.63 0.62 0.63 194 
weighted avg 0.67 0.68 0.67 194 


In [54]: 


from sklearn.ensemble import RandomForestClassifier 


En [55]: 


clf = RandomForestClassifier(n_estimators=100) 
clf.fit(X train, y train) 


Out[55]: 


Ll RandomFore tClassifier 


iRandomForest lassifier(): 


In [57]: 


# calculate accuracy, precision, recall, and f1-score 
accuracy - accuracy score(y test, y pred) 

precision - precision score(y test, y pred) 

recall - recall score(y test, y pred) 

f1 = f1 score(y test, y pred) 


print("Accuracy: (:.2f)".format(accuracy)) 
print("Precision: (:.2f)".format(precision)) 
print("Recall: (:.2f)".format(recall)) 
print("F1-score: (:.2f)".format(f1)) 


Accuracy: 0.75 
Precision: 0.67 
Recall: 0.48 
F1-score: 0.56 
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In [58]: 


# generate a confusion matrix 

cm - confusion matrix(y test, y pred) 
print("Confusion matrix:") 

print(cm) 


Confusion matrix: 


[[115 15] 
[ 33 31]] 
In (601: 


# create a heatmap visualization of the confusion matrix 
sns.heatmap(cm, annot-True, cmap="Blues") 
plt.title("Confusion Matrix") 

plt.xlabel("Predicted Class") 

plt.ylabel("True Class") 

plt.show() 


Confusion Matrix 
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In (591: 


# generate a classification report 

report = classification_report(y_test, y_pred) 
print("Classification report:") 

print(report) 


Classification report: 


precision recall f1-score support 

0 0.78 0.88 0.83 130 

1 0.67 0.48 0.56 64 

accuracy 0.75 194 
macro avg 0.73 0.68 0.70 194 
weighted avg 0.74 0.75 0.74 194 


In. [61]: 


from sklearn.svm import SVC 
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In [62]: 
clf = SVC(kernel-'linear') 
clf.fit(X train, y train) 


Out[62]: 


VC 


-'linear')| 


In [63]: 


y pred - clf.predict(X test) 


In [64]: 


# calculate accuracy, precision, recall, and f1-score 
accuracy - accuracy score(y test, y pred) 

precision - precision score(y test, y pred) 

recall - recall score(y test, y pred) 

f1 = f1 score(y test, y pred) 


print("Accuracy: (:.2f)".format(accuracy)) 
print("Precision: (:.2f)".format(precision)) 
print("Recall: (:.2f)".format(recall)) 
print("F1-score: (:.2f)".format(f1)) 


Accuracy: 0.78 
Precision: 0.76 
Recall: 0.50 

F1-score: 0.60 


In [65]: 


# generate a confusion matrix 

cm - confusion matrix(y test, y pred) 
print("Confusion matrix:") 

print(cm) 


Confusion matrix: 
[[120 10] 
[32 32]] 
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In [66]: 


# create a heatmap visualization of the confusion matrix 
sns.heatmap(cm, annot-True, cmap="Blues") 
plt.title("Confusion Matrix") 

plt.xlabel("Predicted Class") 

plt.ylabel("True Class") 

plt.show() 


Confusion Matrix 
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In [67]: 


# generate a classification report 

report = classification_report(y_test, y_pred) 
print("Classification report:") 

print(report) 


Classification report: 


precision recall f1-score support 

0 0.79 9.92 0.85 130 

1 0.76 0.50 0.60 64 

accuracy 0.78 194 
macro avg 0.78 8.71 0.73 194 
weighted avg 0.78 0.78 0.77 194 


In [68]: 


from sklearn.model selection import GridSearchCV, train test split 


In [69]: 


param grid - ('C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly'], 'degree': [2, 3, 4] 


In [70]: 


svc - SVC() 
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In [71]: 


grid search = GridSearchCV(estimator-svc, param_grid=param_grid, cv=5) 


In: [72]; 


grid_search.fit(X_train, y_train) 


Out [72]: 


>» GridSearchCV 


In [73]: 


best params - grid search.best params 
best score = grid search.best score 


print("Best parameters:", best params) 
print("Best score:", best score) 


Best parameters: ('C': 1, 'degree': 2, 'kernel': 'rbf') 
Best score: 0.7668515742128935 


In [76]: 


# Train Decision Tree classifier 

dt = DecisionTreeClassifier() 
dt.fit(X_train, y_train) 

dt_pred = dt.predict(X_test) 

dt_acc = accuracy_score(y_test, dt_pred) 


# Train Random Forest classifier 

rf = RandomForestClassifier() 

rf.fit(X_ train, y_train) 

rf_pred = rf.predict(X_test) 

rf acc = accuracy_score(y_test, rf_pred) 


# Train SVM cLassifier 

svm = SVC(C = 1, degree = 2, kernel = 'rbf') 
svm.fit(X train, y train) 

svm pred - svm.predict(X test) 

svm acc - accuracy score(y test, svm pred) 


En [77]: 
models - ['Decision Tree', 'Random Forest', 'SVM'] 


accuracies - [dt acc, rf acc, svm acc] 


In [78]: 


x pos = np.arange(len(models)) 
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In [79]: 


plt.bar(x_pos, accuracies, align='center', alpha=0.5) 
plt.xticks(x_pos, models) 

plt.ylabel('Accuracy') 

plt.title('Model Comparison') 

plt.show() 


Model Comparison 


Accuracy 


Decision Tree Random Forest SVM 
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